By: Alan Zhang
To do list:
-- Explanation on technological improvements. See 3A)
-- Court differences 3A)
3A. Trends over time
4a. Matrix Plot
4b. Backwards Selection
-- Write intro here--
I did an analysis of ATP Tour matches to learn statistics and computer science, and to learn more about a sport I'm passionate about. I read the data directly from the internet, without downloading any files by hand, and did statistical analysis on my favorite players. Working with the data has shown interesting trends over the history of tennis ---
We want to set up our data to look at specific statistics such as how stats differ for losers and winners on different courts, as well as statistics of top players over time.
Our code first sets up arrays to store these statistics, and then scrapes the data off of a publicly available webpage. I used this dataset: https://github.com/JeffSackmann/tennis_atp/blob/master/README.md
Since the dataset is stored as matches per year, I conglomerated all the years into one big dataset so I could analyze over time. While conglomerating, I extracted yearly averages of statistics I specifically wanted to look at.
# Necessary Imports
import pandas as pd
import datetime
import matplotlib.pyplot as plt
# Defining date parser for our data
def parse(t):
    """Convert an iterable of ``YYYYMMDD`` stamps into ``datetime.date`` objects.

    Args:
        t: Iterable of date stamps (strings or integers). Each one is turned
            into a string and split as a four-character year, a
            two-character month, and the remainder as the day.

    Returns:
        A list with one ``datetime.date`` per input element. Stamps that
        cannot be interpreted as a date are replaced by the placeholder
        ``datetime.date(1900, 1, 1)``.
    """
    fallback = datetime.date(1900, 1, 1)
    ret = []
    for ts in t:
        string = str(ts)
        try:
            tsdt = datetime.date(int(string[:4]), int(string[4:6]), int(string[6:]))
        except (TypeError, ValueError):
            # int() raises ValueError for missing or garbled stamps such as
            # "nan" or "", and date() raises it for out-of-range fields.
            # Both must fall back instead of aborting the whole load.
            tsdt = fallback
        ret.append(tsdt)
    return ret
# Load every season from 1968 through 2021 and record the yearly statistics
# used by the plots further down. Each season is summarised from its own
# frame (df1); the raw frames are concatenated once at the end.
SEASONS = range(1968, 2022)
SEASON_URL = "https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_"

# Yearly average aces of winners, overall and per surface
yearlyAces = []
yearlyAcesHard = []
yearlyAcesGrass = []
yearlyAcesClay = []

# Yearly average aces of losers, overall and per surface
yearlyLosersAces = []
yearlyAcesHardL = []
yearlyAcesGrassL = []
yearlyAcesClayL = []

# Yearly number of wins in Grand Slam semifinals and finals, per country
USnumOfTop4 = []
AUSnumOfTop4 = []
ESPnumOfTop4 = []
FRAnumOfTop4 = []

# Yearly average heights over all matches
averageWinnersHeight = []
averageLosersHeight = []

# Yearly averages restricted to the "top 8" subset of matches
averageWinnersHeightTop8 = []
averageLosersHeightTop8 = []
averageWinnersAgeTop8 = []
averageLosersAgeTop8 = []

# Yearly average double faults, over all matches and over the top 8 subset
doubleFaultsW = []
doubleFaultsL = []
doubleFaultsWTop8 = []
doubleFaultsLTop8 = []

seasonFrames = []
for i in SEASONS:
    url = SEASON_URL + str(i) + ".csv"
    # NOTE(review): `date_parser` is reported as deprecated in pandas 2.x;
    # it is kept here so the existing `parse` fallback behaviour is unchanged.
    df1 = pd.read_csv(url, index_col=None,
                      header=0,
                      parse_dates=[5],
                      encoding="ISO-8859-1",
                      date_parser=parse)
    seasonFrames.append(df1)

    # Aces, overall
    yearlyAces.append(df1['w_ace'].mean())
    yearlyLosersAces.append(df1['l_ace'].mean())

    # Aces, split by court surface
    hardSurface = df1.loc[df1['surface'] == "Hard"]
    grass = df1.loc[df1['surface'] == "Grass"]
    clay = df1.loc[df1['surface'] == "Clay"]
    yearlyAcesHard.append(hardSurface['w_ace'].mean())
    yearlyAcesGrass.append(grass['w_ace'].mean())
    yearlyAcesClay.append(clay['w_ace'].mean())
    yearlyAcesHardL.append(hardSurface['l_ace'].mean())
    yearlyAcesGrassL.append(grass['l_ace'].mean())
    yearlyAcesClayL.append(clay['l_ace'].mean())

    # Semifinals and finals of Grand Slams only. The round test and the
    # tournament-level test must BOTH hold; OR-ing them would keep every
    # Grand Slam match plus every final/semifinal of any tournament.
    top4 = df1.loc[df1['round'].isin(["F", "SF"]) & (df1['tourney_level'] == "G")]
    top4Winners = top4['winner_ioc'].value_counts()
    # .get(..., 0): a country with no such win in a season counts as zero
    # instead of raising KeyError.
    USnumOfTop4.append(top4Winners.get("USA", 0))
    AUSnumOfTop4.append(top4Winners.get("AUS", 0))
    ESPnumOfTop4.append(top4Winners.get("ESP", 0))
    FRAnumOfTop4.append(top4Winners.get("FRA", 0))

    # Heights over all matches of the season
    averageWinnersHeight.append(df1['winner_ht'].mean())
    averageLosersHeight.append(df1['loser_ht'].mean())

    # Built from this season's frame (df1), not the running concatenation,
    # so each entry is a per-year value rather than a cumulative average.
    # NOTE(review): this mask keeps every Grand Slam match in addition to
    # all QF/SF/F matches -- confirm whether that is the intended "top 8".
    top8 = df1.loc[(df1['round'] == "F") | (df1['round'] == "SF") | (df1['tourney_level'] == "G") | (df1['round'] == "QF")]
    averageWinnersHeightTop8.append(top8['winner_ht'].mean())
    averageLosersHeightTop8.append(top8['loser_ht'].mean())
    averageWinnersAgeTop8.append(top8['winner_age'].mean())
    averageLosersAgeTop8.append(top8['loser_age'].mean())

    # Double faults, all matches and top 8 subset
    doubleFaultsW.append(df1['w_df'].mean())
    doubleFaultsL.append(df1['l_df'].mean())
    doubleFaultsWTop8.append(top8['w_df'].mean())
    doubleFaultsLTop8.append(top8['l_df'].mean())

# A single concat avoids re-copying the accumulated data on every iteration.
df = pd.concat(seasonFrames)
combined = df
display(combined)
After creating our big dataset and extracting info, here I graph the statistics I collected to observe some trends.
# Winners' aces per match over time, overall and split by court surface.
years = range(1968, 2022)
# The "Total" line must use the winners' series (yearlyAces); the losers'
# series belongs to the losers' figure.
plt.plot(years, yearlyAces, label="Total")
plt.xlabel("Years")
plt.ylabel("Average Number of Winner's Aces")
plt.title("Number of Winner's Aces Over Time")
plt.plot(years, yearlyAcesHard, label="Hard")
plt.plot(years, yearlyAcesGrass, label="Grass")
plt.plot(years, yearlyAcesClay, label="Clay")
plt.legend()
This is a graph of the number of aces a winner scores over time on different courts. As we can see, the general trend is upwards. This could be attributed to a multitude of reasons.
One such reason would be equipment improvements ---- or skill at receiving lags behind faster serves because players need to receive
Another trend we can see is that aces are much more common on grass, and more rare on clay courts. --- Insert some science behind why clay is better traction so people get to the ball faster?
One outlier to note is the spike in 2020. Due to COVID, there were only 2 games played on grass that year, so the spike in 2020 is not meaningful.
# Losers' aces per match over time, overall and split by court surface.
# The series are drawn in a fixed order so the line colours stay the same.
loserAceSeries = [
    ("Total", yearlyLosersAces),
    ("Hard", yearlyAcesHardL),
    ("Grass", yearlyAcesGrassL),
    ("Clay", yearlyAcesClayL),
]
for surfaceLabel, aceSeries in loserAceSeries:
    plt.plot(years, aceSeries, label=surfaceLabel)
plt.title("Number of Loser's Aces Over Time")
plt.xlabel("Years")
plt.ylabel("Average Number of Loser's Aces")
plt.legend()
Here, we plot the same graph as above, but for the losing side of each match. We can see the same trends with courts and the number of aces going up, but the overall number of aces per match is lower than the winners which is to be expected.
Although this graph doesn't give us any new information, it's reassuring to know that the data makes sense: losers have worse stats than winners.
Next, I was interested in how strong the top regions were from around the world. I decided to measure this by seeing how many players from each top region made it to the top 4 in Grand Slam tournaments.
# Where good players come from over time: one line per tracked country,
# drawn in insertion order.
countrySeries = {
    'US Players': USnumOfTop4,
    'Australian Players': AUSnumOfTop4,
    'Spain Players': ESPnumOfTop4,
    'France Players': FRAnumOfTop4,
}
for countryLabel, countryCounts in countrySeries.items():
    plt.plot(years, countryCounts, label=countryLabel)
plt.xlabel("Years")
plt.ylabel("Number of Players At top 4 in Grand Slams")
plt.title("Number of Players From Top regions Who made it to Top 4 in Grand Slams")
plt.legend()
Looking at the graph, the trend is clear. The US used to dominate in the early years, but nearing the turn of the century, France and Spain in particular overtook the US in the number of athletes making it into the top 4 at Grand Slams.
-- Insert speculation as to why here --
A general interest in sports is the physical makeup of top athletes. What makes a good basketball player? The obvious answer is height. I wondered this about tennis, so I analyzed the heights of winners and losers of every match over time.
# Average height of match winners versus losers, per year.
# The raw winners' series is printed first as a quick sanity check.
print(averageWinnersHeight)
heightSeries = (
    ('Winners', averageWinnersHeight),
    ('Losers', averageLosersHeight),
)
for heightLabel, heightValues in heightSeries:
    plt.plot(years, heightValues, label=heightLabel)
plt.xlabel("Years")
plt.ylabel("Average Height (CM)")
plt.title("Heights of Winners and Losers over Time")
plt.legend()
Since I'm plotting the averages of many, many matches, the difference could be negligible, but the graph shows that winners are generally taller, which could give some insight into height providing a slight advantage while playing tennis.
To narrow down the dataset a little and look at the peak of the sport, I decided to look at the heights of the Top 8 only to see if height mattered more or less at the highest level of competition.
# Average height of winners versus losers, restricted to the Top 8 subset.
top8HeightSeries = (
    ('Winners', averageWinnersHeightTop8),
    ('Losers', averageLosersHeightTop8),
)
for top8HeightLabel, top8HeightValues in top8HeightSeries:
    plt.plot(years, top8HeightValues, label=top8HeightLabel)
plt.xlabel("Years")
plt.ylabel("Average Height (CM)")
plt.title("Heights of Winners and Losers in Top 8 over Time")
plt.legend()
-- Why this could be --
On a similar note, age has always played different factors in different sports. I wondered how much age mattered in tennis so I plotted it.
# Ages of Losers VS Winners Over Time In Top 8
# Both plotted series come from the Top 8 subset (the *AgeTop8 lists), so
# the title says so, matching the other Top 8 figures.
plt.plot(years, averageWinnersAgeTop8, label='Winners')
plt.plot(years, averageLosersAgeTop8, label='Losers')
plt.title("Ages of Winners and Losers in Top 8 over Time")
plt.xlabel("Years")
plt.ylabel("Average Age (Years)")
plt.legend()
Although the graph looks like there is a trend, that players became younger, the age range is so small, most players being within 25.5 to 27 years old. This graph tells us very little.
-- Can find historical evidence of tennis being an old persons game --
Diving more into specific statistics, I was interested in the number of double faults over time, as I thought it should just get better over time as players become more consistent.
# Average double faults per match for winners versus losers, per year.
doubleFaultSeries = (
    ('Winners', doubleFaultsW),
    ('Losers', doubleFaultsL),
)
for faultLabel, faultValues in doubleFaultSeries:
    plt.plot(years, faultValues, label=faultLabel)
plt.xlabel("Years")
plt.ylabel("Average Double Faults")
plt.title("Double Faults of Winners and Losers over Time")
plt.legend()
There are three things to analyze here.
From the graph we can see that the losers distribution is the same as the winners but shifted up by 0.8.
The clear takeaway is that the losers line is shifted up; losers have more double faults, because double faults contribute to a loss.
An interesting and not so clear takeaway from this graph is why the loss distribution looks so similar to the win distribution. It could have something to do with the fact that we are comparing winners and losers, and with the way we conglomerated our data, a player can contribute to both the winners and losers average.
The last takeaway would come from analyzing the trend over time, why both losers and winners started serving more double faults around 1997, then trending down, and back up.
-- Speculation -- Around 2005-6 ish people wanted more consistent serves, not giving up free points. After achieving consistency, people tried to gain speed back, playing more high-risk, high-reward serves, resulting in more double faults.
Since this trend was so interesting, I decided to take a closer look at the highest level of competition. Here is a graph of double faults of winners vs losers from top 8 matches.
# Double faults of losers versus winners over time in Top 8 matches.
top8FaultSeries = (
    ('Winners', doubleFaultsWTop8),
    ('Losers', doubleFaultsLTop8),
)
for top8FaultLabel, top8FaultValues in top8FaultSeries:
    plt.plot(years, top8FaultValues, label=top8FaultLabel)
plt.xlabel("Years")
plt.ylabel("Average Double Faults")
plt.title("Double Faults of Winners and Losers in Top 8 over Time")
plt.legend()
From this graph the trend is even more exact. Losers of top 8 matches serve 0.8 to 0.9 more double faults than winners of these matches. It is very confusing why it is so exact.
I decided to try and model what makes a good player by predicting player winrate from the available statistics. The first step in doing this is extracting each player's data from the match data.
# Build one record per player: name, win rate, average statistics over the
# matches they won, then average statistics over the matches they lost.
WIN_STAT_COLUMNS = ['minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
                    'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced']
LOSS_STAT_COLUMNS = ['minutes', 'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon',
                     'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced']
# Players with fewer matches than this are filtered out
MIN_MATCHES = 100

listOfPlayers = combined['winner_name'].unique()
winCounts = combined['winner_name'].value_counts()
lossCounts = combined['loser_name'].value_counts()

# One grouped pass per side replaces a full-table boolean scan per player.
# Averaging only the listed stat columns also keeps text and date columns
# out of mean(), which pandas 2.x refuses to average by default.
winAvgsByPlayer = combined.groupby('winner_name')[WIN_STAT_COLUMNS].mean()
lossAvgsByPlayer = combined.groupby('loser_name')[LOSS_STAT_COLUMNS].mean()

winRates = []
for player in listOfPlayers:
    playerWins = winCounts.get(player, default=0)
    totalGames = playerWins + lossCounts.get(player, default=0)
    # We filter out people who have played less than 100 games
    if totalGames < MIN_MATCHES:
        continue
    winAvgs = winAvgsByPlayer.loc[player]
    if player in lossAvgsByPlayer.index:
        lossAvgs = lossAvgsByPlayer.loc[player]
    else:
        # No recorded losses: every loss average is NaN, the same values
        # that averaging an empty selection produces.
        lossAvgs = pd.Series(float('nan'), index=LOSS_STAT_COLUMNS)
    winRates.append((player, playerWins / totalGames,
                     *winAvgs[WIN_STAT_COLUMNS], *lossAvgs[LOSS_STAT_COLUMNS]))
Now I can combine this all into its own dataframe
# Turn the per-player records into a DataFrame. The column labels are the
# same ten statistic names, once prefixed "Winning " and once "Losing ".
playerStatNames = ['Minutes', 'Aces', 'Double Fault', 'Serve Point', 'First In',
                   'First Point Won', 'Second Point Won', 'Serve Games',
                   'Break Points Saved', 'Break Points Faced']
playerColumns = ['Player', 'Winrate']
playerColumns += ['Winning ' + statName for statName in playerStatNames]
playerColumns += ['Losing ' + statName for statName in playerStatNames]

playersDataframe = pd.DataFrame.from_records(winRates, columns=playerColumns)
# Discard any player with at least one missing statistic
playersDataframe.dropna(how='any', inplace=True)
display(playersDataframe)
Now I can do a preliminary matrix plot to see if anything is correlated well with winrate
import seaborn as sns

# Matrix (pair) plot over every column of the per-player table.
predictors = []
sns.pairplot(data=playersDataframe)
In this matrix plot, we would be looking for any variables that are correlated with winrate, but not correlated with each other so when we use them in the model, they do not explain the same variance.
However, looking at the diagonal of our matrix plot, we can see that most of our stats look normally distributed. This is a problem, because if we look at our scatter plots, none of our variables are correlated with winrate very well. This does not bode well for the model.
Nevertheless, I will attempt to continue with the model starting with a multiple regression model against all our statistics.
import statsmodels.api as sm

# Multiple regression of win rate on all twenty per-player statistics.
fullModelPredictors = [
    'Winning Minutes', 'Winning Aces', 'Winning Double Fault', 'Winning Serve Point',
    'Winning First In', 'Winning First Point Won', 'Winning Second Point Won',
    'Winning Serve Games', 'Winning Break Points Saved', 'Winning Break Points Faced',
    'Losing Minutes', 'Losing Aces', 'Losing Double Fault', 'Losing Serve Point',
    'Losing First In', 'Losing First Point Won', 'Losing Second Point Won',
    'Losing Serve Games', 'Losing Break Points Saved', 'Losing Break Points Faced',
]
y = playersDataframe['Winrate']
# add_constant supplies the intercept column for the fit
X = sm.add_constant(playersDataframe[fullModelPredictors])
est = sm.OLS(y, X).fit()
est.summary()
I'm going to use backward selection in an attempt to improve my model.
Backwards selection means I will try to prune high p-value predictors out of the model, as a high p-value shows that a predictor adds little to the model. However, the model looks doomed because of the low R-Squared value.
In spite of this, we will continue.
# Refit with the reduced predictor set from the first backward-selection pass.
reducedModelPredictors = [
    'Winning Minutes', 'Winning Double Fault', 'Winning Serve Games',
    'Winning Break Points Saved', 'Winning Break Points Faced', 'Losing Minutes',
    'Losing Double Fault', 'Losing Serve Point', 'Losing First Point Won',
    'Losing Second Point Won', 'Losing Serve Games', 'Losing Break Points Saved',
    'Losing Break Points Faced',
]
y = playersDataframe['Winrate']
# add_constant supplies the intercept column for the fit
X = sm.add_constant(playersDataframe[reducedModelPredictors])
est = sm.OLS(y, X).fit()
est.summary()
Looking at the coefficients, they make sense. For example, winning minutes has a negative coefficient, as taking longer to win your games may mean you are a less dominant player in your games, thus having a lower winrate.
However, our R-Squared value is quite low, making this a bad model. This could be a result of tennis statistics being normally distributed, and looking at our matrix plot, it's hard to see correlations when all the points are bunched together.
We will continue selecting only the low p-value predictors. With an alpha (significance level) of 0.05, there are only 2 predictors that satisfy this alpha.
# Final backward-selection step: only the two remaining predictors.
finalModelPredictors = ['Winning Minutes', 'Losing Minutes']
y = playersDataframe['Winrate']
# add_constant supplies the intercept column for the fit
X = sm.add_constant(playersDataframe[finalModelPredictors])
est = sm.OLS(y, X).fit()
est.summary()
Eliminating all of the predictors with p-values higher than a significance level of 0.05 leaves us with just two predictors. Funnily enough, this tanks our R-squared, making our already terrible model worse.
Looking at the scatter plots, I was already skeptical that a linear model would work to predict winrates. I was somewhat optimistic though, playing the game of tennis myself because I thought stats could show what made good players good, and what stats I could focus on myself to improve my winrate.
The only results that make sense are that the faster you win your games, the better your winrate. Similarly, the longer it takes you to lose, the better your winrate.
Conclusion. Currently, with publicly available stats, I can't make a linear model to predict wins. It seems like the stats people are choosing to document don't really matter. Maybe with more sophisticated stats that professionals currently use, such as shot selection or the percentage of shots in, I could get more meaningful results.
After all this doom and gloom, I wanted to satisfy my fan perspective, comparing the best of the best players against each other. I decided to compare Federer, Nadal, and Novak.
# Compare how Djokovic's, Nadal's and Federer's statistics in the matches
# they won develop over time.
CAREER_STAT_COLUMNS = ['minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
                       'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced']
CAREER_STAT_LABELS = ['Year', 'Winning Minutes', 'Winning Aces', 'Winning Double Fault',
                      'Winning Serve Point', 'Winning First In', 'Winning First Point Won',
                      'Winning Second Point Won', 'Winning Serve Games',
                      'Winning Break Points Saved', 'Winning Break Points Faced']


def _career_win_averages(playerWins, years):
    """Return one ``(year, *stat averages)`` record per year in *years*.

    For each year, the averages are taken over the rows of *playerWins* whose
    ``tourney_date`` is earlier than 1 January of that year, i.e. they are
    averages to date rather than single-season values. A year with no
    earlier rows yields NaN for every statistic.

    Args:
        playerWins: DataFrame holding the matches one player won.
        years: Iterable of integer years.

    Returns:
        List of tuples: the year followed by the mean of each column in
        ``CAREER_STAT_COLUMNS``, in that order.
    """
    records = []
    for year in years:
        cutoff = datetime.datetime(year, 1, 1)
        # Only the stat columns are averaged; a bare DataFrame.mean() would
        # also hit text and date columns, which pandas 2.x does not skip.
        earlierWins = playerWins.loc[playerWins['tourney_date'] < cutoff, CAREER_STAT_COLUMNS]
        records.append((year, *earlierWins.mean()))
    return records


novak = combined.loc[(combined['winner_name'] == 'Novak Djokovic')]
nadal = combined.loc[(combined['winner_name'] == 'Rafael Nadal')]
federer = combined.loc[(combined['winner_name'] == 'Roger Federer')]

comparisonYears = range(2000, 2022)
novakStatsOverTime = _career_win_averages(novak, comparisonYears)
nadalStatsOverTime = _career_win_averages(nadal, comparisonYears)
federerStatsOverTime = _career_win_averages(federer, comparisonYears)

novakOverTime = pd.DataFrame.from_records(novakStatsOverTime, columns=CAREER_STAT_LABELS)
nadalOverTime = pd.DataFrame.from_records(nadalStatsOverTime, columns=CAREER_STAT_LABELS)
federerOverTime = pd.DataFrame.from_records(federerStatsOverTime, columns=CAREER_STAT_LABELS)
# Plot one statistic for the three players. The column name drives the
# axis label and the title, so the labels cannot drift away from the data
# that is actually drawn.
comparedStat = "Winning Serve Point"
plt.plot(novakOverTime['Year'], novakOverTime[comparedStat], label='Novak')
plt.plot(nadalOverTime['Year'], nadalOverTime[comparedStat], label='Nadal')
plt.plot(federerOverTime['Year'], federerOverTime[comparedStat], label='Federer')
plt.xlabel("Years")
plt.ylabel("Average " + comparedStat)
plt.title(comparedStat + " over Time")
plt.legend()
%%shell
# Export the notebook to an HTML file with nbconvert.
jupyter nbconvert --to html Alan_Final_Project.ipynb